When our member traveling between different states (or country), which fast food/restaurant is most popular ?
Note:
Member home location can be found in member profile. It is changing overtime. So, when matching transaction with member profile, we need match by both member Id and transaction date.
Restaurant and fast food having MCCcode: 5812 and 5814. These code are one column of transaction data.
Restaurant and fast food store location can be found in transcation data.
Find all fast food/restaurant transaction that happens in a different state compare with the state of bill( statement ) in that month.
Summarize the fast food/restaurant by their names.
trans1 <- read.csv('Data/April Vintage Transactions page 1.csv', stringsAsFactors=FALSE)
trans2 <- read.csv('Data/April Vintage Transactions page 2.csv', stringsAsFactors=FALSE)
trans2$DebtDimId <- as.character(trans2$DebtDimId)
trans3 <- read.csv('Data/April Vintage Transactions page 3.csv', stringsAsFactors=FALSE)
trans3$DebtDimId <- as.character(trans3$DebtDimId)
trans4 <- read.csv('Data/April Vintage Transactions page 4.csv', stringsAsFactors=FALSE)
trans4$DebtDimId <- as.character(trans4$DebtDimId)
trans.dat <- bind_rows( bind_rows(trans1,trans2),bind_rows(trans3,trans4) )
nrow(trans.dat)
## [1] 197271
head(trans.dat)
## DebtDimId TimeDimDt PremierTranCode PremierTranDescr
## 1 24577720 06/14/2013 101 Sale
## 2 24577720 06/14/2013 101 Sale
## 3 24579437 04/14/2013 101 Sale
## 4 24579437 04/14/2013 101 Sale
## 5 24579683 04/08/2013 101 Sale
## 6 24579683 04/08/2013 101 Sale
## FDRMerchDescr TransactionAmt MCCcode
## 1 TAG*TAGGED INC 866-217-8131 CA 10.00 8699
## 2 RALEY'S #236 OROVILLE CA 35.71 5411
## 3 CS*SANTANDER CONSUM 866-724-2743 CA 199.95 7399
## 4 FAST LANE # 4744 JACKSON MS 21.05 5542
## 5 BAR LOUIE GRAND RA GRAND RAPIDS MI 8.42 5812
## 6 TGI FRIDAYS #1590 KENTWOOD MI 4.00 5812
## MCCCategory
## 1 Professional Services and Membership Organizations
## 2 Retail Stores
## 3 Business Services
## 4 Automobiles/Vehicles
## 5 Miscellaneous Stores
## 6 Miscellaneous Stores
## MCCDescription
## 1 Membership Organizations not elsewhere classified
## 2 Grocery Stores, Supermarkets
## 3 Business Services not elsewhere classified
## 4 Automated Fuel Dispenser
## 5 Eating Places, Restaurants
## 6 Eating Places, Restaurants
summary(trans.dat)
## DebtDimId TimeDimDt PremierTranCode PremierTranDescr
## Length:197271 Length:197271 Min. :101.0 Length:197271
## Class :character Class :character 1st Qu.:101.0 Class :character
## Mode :character Mode :character Median :101.0 Mode :character
## Mean :102.5
## 3rd Qu.:101.0
## Max. :305.0
## NA's :1
## FDRMerchDescr TransactionAmt MCCcode MCCCategory
## Length:197271 Min. :-4132.08 Min. : 0 Length:197271
## Class :character 1st Qu.: 7.00 1st Qu.:5411 Class :character
## Mode :character Median : 15.25 Median :5542 Mode :character
## Mean : 27.82 Mean :5740
## 3rd Qu.: 32.52 3rd Qu.:5814
## Max. : 4132.08 Max. :9754
## NA's :2 NA's :2
## MCCDescription
## Length:197271
## Class :character
## Mode :character
##
##
##
##
profile1 <- read.csv('Data/April Vintage 2013.csv', stringsAsFactors = FALSE)
profile2 <- read.csv('Data/April Vintage 2013 _ page 2.csv', stringsAsFactors = FALSE)
prof.dat <- bind_rows(profile1, profile2)
nrow(prof.dat)
## [1] 173253
head(prof.dat)
## DebtDimId AccountDimId FDROpenDt DatePrevStmt StatementDt MOB ExtStatus
## 1 24508718 65089656 04/02/2013 NULL 04/08/2013 0
## 2 24508718 65089656 04/02/2013 04/08/2013 05/09/2013 1
## 3 24508718 65089656 04/02/2013 05/09/2013 06/07/2013 2
## 4 24508718 65089656 04/02/2013 06/07/2013 07/09/2013 3
## 5 24508718 65089656 04/02/2013 07/09/2013 08/09/2013 4
## 6 24508718 65089656 04/02/2013 08/09/2013 09/08/2013 5
## IntStatus DaysDeliq ActualMinPay OverlimitAmount DisplayMinPay
## 1 0 30 0 30
## 2 0 30 0 30
## 3 0 30 0 30
## 4 0 30 0 30
## 5 0 30 0 30
## 6 0 30 0 30
## OpeningBalance BillLateCharge EndingBalance CreditLimit TotalNetPayments
## 1 0.00 0 180.00 300 0
## 2 180.00 0 154.50 300 30
## 3 154.50 0 159.74 300 30
## 4 159.74 0 267.92 300 30
## 5 267.92 0 245.81 300 30
## 6 245.81 0 223.06 300 30
## TotalNetPurchaseAndCash TotalFeesBilled Concessions QuarterlyFicoScore
## 1 0.00 0.00 0 0
## 2 0.00 4.50 0 536
## 3 19.58 15.66 0 536
## 4 130.00 8.18 0 536
## 5 0.00 7.89 0 536
## 6 0.00 7.25 0 515
## BehavScore City ZState State ZipCode X24856314
## 1 11 NORTH LAS VEGAS NV NV 89031 NA
## 2 636 NORTH LAS VEGAS NV NV 89031 NA
## 3 639 NORTH LAS VEGAS NV NV 89031 NA
## 4 644 NORTH LAS VEGAS NV NV 89031 NA
## 5 648 NORTH LAS VEGAS NV NV 89031 NA
## 6 657 NORTH LAS VEGAS NV NV 89031 NA
## X65742793 X04.18.2013 NULL. X04.21.2013 X0 X X.1 X0.1 X30 X0.2 X30.1
## 1 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## 2 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## 3 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## 4 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## 5 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## 6 NA <NA> <NA> <NA> NA <NA> <NA> NA NA NA NA
## X0.3 X0.4 X100 X400 X0.5 X0.6 X0.7 X0.8 X0.9 X11 OXNARD CA CA.1 X93032
## 1 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
## 2 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
## 3 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
## 4 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
## 5 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
## 6 NA NA NA NA NA NA NA NA NA NA <NA> <NA> <NA> NA
summary(prof.dat)
## DebtDimId AccountDimId FDROpenDt
## Min. :24508718 Min. :65089656 Length:173253
## 1st Qu.:24797902 1st Qu.:65639131 Class :character
## Median :24817093 Median :65675029 Mode :character
## Mean :24806966 Mean :65711462
## 3rd Qu.:24836872 3rd Qu.:65709823
## Max. :24856301 Max. :69639949
## NA's :80623 NA's :80623
## DatePrevStmt StatementDt MOB ExtStatus
## Length:173253 Length:173253 Min. : 0.00 Length:173253
## Class :character Class :character 1st Qu.: 3.00 Class :character
## Mode :character Mode :character Median : 7.00 Mode :character
## Mean : 7.27
## 3rd Qu.:12.00
## Max. :17.00
## NA's :80623
## IntStatus DaysDeliq ActualMinPay OverlimitAmount
## Length:173253 Min. : 0.00 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 0.00 1st Qu.: 30.0 1st Qu.: 0.00
## Mode :character Median : 0.00 Median : 30.0 Median : 0.00
## Mean : 25.54 Mean : 63.2 Mean : 28.39
## 3rd Qu.: 30.00 3rd Qu.: 60.0 3rd Qu.: 30.19
## Max. :390.00 Max. :1237.0 Max. :2199.63
## NA's :80623 NA's :80623 NA's :80623
## DisplayMinPay OpeningBalance BillLateCharge EndingBalance
## Min. :-401.01 Min. :-500.6 Min. : 0.00 Min. :-500.6
## 1st Qu.: 30.00 1st Qu.: 175.0 1st Qu.: 0.00 1st Qu.: 203.2
## Median : 33.71 Median : 304.6 Median : 0.00 Median : 310.6
## Mean : 84.99 Mean : 333.3 Mean : 8.55 Mean : 351.4
## 3rd Qu.: 90.00 3rd Qu.: 441.9 3rd Qu.:25.00 3rd Qu.: 455.9
## Max. :2374.63 Max. :2499.6 Max. :35.00 Max. :2499.6
## NA's :80623 NA's :80623 NA's :80623 NA's :80623
## CreditLimit TotalNetPayments TotalNetPurchaseAndCash
## Min. : 0.0 Min. :-817.77 Min. :-2007.74
## 1st Qu.:300.0 1st Qu.: 0.00 1st Qu.: 0.00
## Median :300.0 Median : 30.00 Median : 0.00
## Mean :428.7 Mean : 60.25 Mean : 59.46
## 3rd Qu.:700.0 3rd Qu.: 68.97 3rd Qu.: 63.88
## Max. :900.0 Max. :2770.07 Max. : 2488.63
## NA's :80623 NA's :80623 NA's :80623
## TotalFeesBilled Concessions QuarterlyFicoScore BehavScore
## Min. :-359.35 Min. :-607.87 Min. : 0.0 Min. : 11.0
## 1st Qu.: 5.90 1st Qu.: 0.00 1st Qu.:513.0 1st Qu.:556.0
## Median : 15.05 Median : 0.00 Median :576.0 Median :624.0
## Mean : 22.31 Mean : -0.74 Mean :521.2 Mean :532.5
## 3rd Qu.: 38.55 3rd Qu.: 0.00 3rd Qu.:626.0 3rd Qu.:651.0
## Max. : 190.52 Max. : 414.31 Max. :798.0 Max. :717.0
## NA's :80623 NA's :80623 NA's :80623 NA's :80623
## City ZState State ZipCode
## Length:173253 Length:173253 Length:173253 Min. : 605
## Class :character Class :character Class :character 1st Qu.:29102
## Mode :character Mode :character Mode :character Median :46224
## Mean :51290
## 3rd Qu.:77703
## Max. :99775
## NA's :80623
## X24856314 X65742793 X04.18.2013
## Min. :24856314 Min. :65512291 Length:173253
## 1st Qu.:24865229 1st Qu.:65753728 Class :character
## Median :24876408 Median :65772563 Mode :character
## Mean :24876981 Mean :65821277
## 3rd Qu.:24887693 3rd Qu.:65793608
## Max. :24901159 Max. :69687240
## NA's :132942 NA's :132942
## NULL. X04.21.2013 X0 X
## Length:173253 Length:173253 Min. : 0.00 Length:173253
## Class :character Class :character 1st Qu.: 3.00 Class :character
## Mode :character Mode :character Median : 7.00 Mode :character
## Mean : 7.82
## 3rd Qu.:12.00
## Max. :17.00
## NA's :132942
## X.1 X0.1 X30 X0.2
## Length:173253 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 0.00 1st Qu.: 30.00 1st Qu.: 0.00
## Mode :character Median : 0.00 Median : 38.00 Median : 0.00
## Mean : 20.16 Mean : 63.86 Mean : 25.18
## 3rd Qu.: 30.00 3rd Qu.: 60.00 3rd Qu.: 15.25
## Max. :390.00 Max. :1219.00 Max. :1430.43
## NA's :132942 NA's :132942 NA's :132942
## X30.1 X0.3 X0.4 X100
## Min. : 0.00 Min. :-603.5 Min. : 0.00 Min. :-603.5
## 1st Qu.: 30.00 1st Qu.: 175.0 1st Qu.: 0.00 1st Qu.: 219.8
## Median : 39.00 Median : 358.7 Median : 0.00 Median : 378.7
## Mean : 84.01 Mean : 391.7 Mean : 7.87 Mean : 415.6
## 3rd Qu.: 72.77 3rd Qu.: 645.1 3rd Qu.:25.00 3rd Qu.: 656.5
## Max. :2130.43 Max. :2130.4 Max. :35.00 Max. :2130.4
## NA's :132942 NA's :132942 NA's :132942 NA's :132942
## X400 X0.5 X0.6 X0.7
## Min. : 0.0 Min. :-885.60 Min. :-500.00 Min. :-277.17
## 1st Qu.:300.0 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 6.84
## Median :500.0 Median : 40.00 Median : 0.00 Median : 19.22
## Mean :520.3 Mean : 66.19 Mean : 66.05 Mean : 24.20
## 3rd Qu.:700.0 3rd Qu.: 75.00 3rd Qu.: 68.97 3rd Qu.: 38.88
## Max. :900.0 Max. :1994.14 Max. :1683.99 Max. : 208.95
## NA's :132942 NA's :132942 NA's :132942 NA's :132942
## X0.8 X0.9 X11 OXNARD
## Min. :-379.71 Min. : 0.0 Min. : 11.0 Length:173253
## 1st Qu.: 0.00 1st Qu.:538.0 1st Qu.:586.0 Class :character
## Median : 0.00 Median :598.0 Median :633.0 Mode :character
## Mean : -0.73 Mean :553.4 Mean :552.4
## 3rd Qu.: 0.00 3rd Qu.:640.0 3rd Qu.:655.0
## Max. : 246.40 Max. :791.0 Max. :715.0
## NA's :132942 NA's :132942 NA's :132942
## CA CA.1 X93032
## Length:173253 Length:173253 Min. : 683
## Class :character Class :character 1st Qu.:27616
## Mode :character Mode :character Median :44504
## Mean :49464
## 3rd Qu.:77063
## Max. :99802
## NA's :132942
rm(trans1, trans2, trans3, trans4, profile1, profile2)
remove record without DebtDimId data
temp <- prof.dat[-which(is.na(prof.dat$DebtDimId)),]
Replace NULL DatePrevStmt by FDROpenDt if it is the first record. Replace NULL DatePrevStmt by StatementDt - 30 if it is not the first record
idx <- which(( is.na(temp$DatePrevStmt) | temp$DatePrevStmt == 'NULL') & temp$MOB == 0)
temp[idx,]$DatePrevStmt <- temp[idx,]$FDROpenDt
idx <- which(is.na(temp$DatePrevStmt) | temp$DatePrevStmt == 'NULL' )
temp[idx,]$DatePrevStmt <- format( as.Date(temp[idx,]$StatementDt, "%m/%d/%Y") - 30, format="%m/%d/%Y")
prof.PrevStmt.fix <- temp
rm(temp)
Group user profile data by : Id, State
temp <- prof.PrevStmt.fix %>%
group_by(DebtDimId, State) %>%
summarise(StartDay = min(as.Date(DatePrevStmt, "%m/%d/%Y" )),
EndDay = max(as.Date(StatementDt, "%m/%d/%Y"))) %>%
select(one_of(c('DebtDimId','State', 'StartDay', 'EndDay')))
length(unique(prof.dat$DebtDimId))
## [1] 7039
nrow(temp)
## [1] 7271
tbl.Member.HomeState <- temp
rm(temp, prof.PrevStmt.fix)
All transaction with MCCcode: 5812 or 5814
temp <- trans.dat[which(trans.dat$MCCcode==5812 | trans.dat$MCCcode==5814),]
Remove transactions in which FDRMerchDescr contains phone number
phone.reg.pattern = '\\s*(?:\\+?(\\d{1,3}))?[- (]*(\\d{3})[- )]*(\\d{3})[- ]*(\\d{4})(?: *[x/#]{1}(\\d+))?\\s*'
idx <- grep(phone.reg.pattern, temp$FDRMerchDescr)
if(length(idx) != 0)
{
temp <- temp[-idx,]
}
rm(phone.reg.pattern, idx)
Get State code from FDRMerchDescr
state.reg.pattern = '[[:alpha:]]{2}$'
store.loc <- str_extract(temp$FDRMerchDescr, state.reg.pattern)
idx.incorrect.loc <- which(! (store.loc %in% state.abb))
store.loc[idx.incorrect.loc] <- -1
rm(state.reg.pattern, idx.incorrect.loc)
Store Name 1: FDRMerchDescr - Remove city and state
store.name <- temp$FDRMerchDescr
tbDelete.reg.pattern = '[[:space:]]*[[:alpha:]]{2,} [[:alpha:]]{2}$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 2: Remove string after two space (or more)
tbDelete.reg.pattern = '[[:space:]]{2,}[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 3: Remove string after digit
tbDelete.reg.pattern = '[[:space:]]*[[:alpha:]]*\\d+[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 4: Remove string after #
tbDelete.reg.pattern = '[[:space:]]*#[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 5: Remove string after @
tbDelete.reg.pattern = '[[:space:]]*@[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 6: Remove string after -
tbDelete.reg.pattern = '[[:space:]]*-[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Store Name 7: Remove all non alpha beta charactor
This can solve the difference between ‘Wendy’s’ and ‘Wendy s’
tbDelete.reg.pattern = '[[:punct:][:space:]]*'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)
Bind member id, transaction day, store name, store loc
member.id <- temp$DebtDimId
transaction.Day <- temp$TimeDimDt
tbl.clean.trans <- as.data.frame(cbind( cbind(member.id, transaction.Day), cbind(store.name, store.loc) ))
rm(member.id, transaction.Day, store.name, store.loc)
HomeState <- NULL
MissingList <- NULL
DuplicateList <- NULL
for(i in 1:nrow(tbl.clean.trans)){
transId <- tbl.clean.trans$member.id[i]
transDay <- as.Date(tbl.clean.trans$transaction.Day[i], "%m/%d/%Y")
# Id.Idx <- which(tbl.Member.HomeState$DebtDimId == transId)
# StartDay.Idx <- which(tbl.Member.HomeState$StartDay <= transDay)
# EndDay.Idx <- which(tbl.Member.HomeState$EndDay >= transDay)
#
# s <-Id.Idx[which(Id.Idx %in% StartDay.Idx &
# Id.Idx %in% EndDay.Idx)]
Hs.Idx <- which( tbl.Member.HomeState$DebtDimId == transId &
tbl.Member.HomeState$StartDay <= transDay &
tbl.Member.HomeState$EndDay >= transDay)
if(is.null(Hs.Idx) | length(Hs.Idx) == 0){
MissingList <- c(MissingList, i)
HomeState <- c(HomeState, 'MISS')
next
}
if(length(Hs.Idx) > 1){
DuplicateList <- c(DuplicateList, i)
HomeState <- c(HomeState, 'DUPLICATE')
next
}
HomeState <- c(HomeState, tbl.Member.HomeState$State[Hs.Idx])
}
length(MissingList)
## [1] 11859
length(DuplicateList)
## [1] 29
length(HomeState)
## [1] 40453
tmp.dat <- cbind(tbl.clean.trans, HomeState)
trans.idx <- which( tmp.dat$HomeState != 'MISS' &
tmp.dat$HomeState != 'DUPLICATE' &
# tmp.dat$store.loc != "NULL" &
as.character(tmp.dat$HomeState) != as.character(tmp.dat$store.loc))
rm(tmp.dat)
All stores show up:
visited.stores <- tbl.clean.trans[trans.idx, ]
plot_ly(visited.stores, labels = ~store.name, type='pie')
Remove the store names which shows less than ten times.
tmp.tbl.store.shows.at.least.ten <- visited.stores %>%
group_by(store.name) %>%
summarise(store.show.feq = n()) %>%
filter(store.show.feq >= 10) %>%
select(store.name, store.show.feq )
plot_ly(tmp.tbl.store.shows.at.least.ten, labels = ~store.name, values= ~store.show.feq,textinfo = 'label+percent', type='pie')
MC Donalds is the most popular restaunt when people traveling out their state.